{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eb799929",
   "metadata": {},
   "source": [
    "## GPT for style completion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "65cd4e7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \\\n",
    "                         Trainer, TrainingArguments\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "09b74db5",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # load up a standard gpt2 model\n",
    "\n",
    "tokenizer.pad_token = tokenizer.eos_token  \n",
    "# set our pad token to be the eos token. This lets gpt know how to fill space\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "47ea384e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/sinanozdemir/opt/anaconda3/lib/python3.9/site-packages/transformers/data/datasets/language_modeling.py:54: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# load up our data into a dataset\n",
    "pds_data = TextDataset(\n",
    "    tokenizer=tokenizer,\n",
    "    file_path='../data/PDS2.txt',  # Principles of Data Science - Sinan Ozdemir\n",
    "    block_size=64  # length of each chunk of text to use as a datapoint\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fe52e42c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([  200, 47231,  6418,   286,  6060,  5800,   198, 12211,  5061,   198,\n",
       "           198,    32, 31516,   338,  5698,   284, 13905,  7605,   290,  4583,\n",
       "           284,   198, 11249,   304,   171,   105,   222, 13967,  1366,    12,\n",
       "         15808,  5479,   198,   198, 46200,   272, 18024,  9536,   343,   198,\n",
       "         16012,   346, 31250,   671,   198,   198,  3483, 29138,  2751, 33363,\n",
       "           532,   337,  5883,  4339,    40,   628,   200, 47231,  6418,   286,\n",
       "          6060,  5800,   198, 12211]),\n",
       " torch.Size([64]))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pds_data[0], pds_data[0].shape  # inspect the first point"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "id": "32b488fa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\f",
      "Principles of Data Science\n",
      "Second Edition\n",
      "\n",
      "A beginner's guide to statistical techniques and theory to\n",
      "build eﬀective data-driven applications\n",
      "\n",
      "Sinan Ozdemir\n",
      "Sunil Kakade\n",
      "\n",
      "BIRMINGHAM - MUMBAI\n",
      "\n",
      "\f",
      "Principles of Data Science\n",
      "Second\n"
     ]
    }
   ],
   "source": [
    "print(tokenizer.decode(pds_data[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6914d0d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer, mlm=False,  \n",
    "    # MLM is Masked Language Modelling (for BERT + auto-encoding tasks)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "083de66d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[   40,   716,   281,  5128],\n",
       "        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],\n",
       "        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],\n",
       "        [2396,  716,  314, -100]])}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example of how collator pads data dynamically\n",
    "collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])\n",
    "\n",
    "collator_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "395848aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[   40,   716,   281,  5128],\n",
       "        [ 2396,   716,   314, 50256]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "collator_example.input_ids  # 50256 is our pad token id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cb23c58b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50256"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b6f0c2ea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1, 1, 1, 1],\n",
       "        [1, 1, 1, 0]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "59808749",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[  40,  716,  281, 5128],\n",
       "        [2396,  716,  314, -100]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "collator_example.labels  # note the -100 to ignore loss calculation for the padded token\n",
    "# Labels are shifted inside the GPT model so we don't need to worry about that"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b9701db",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f65c372b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "model = GPT2LMHeadModel.from_pretrained('gpt2')  # load up a GPT2 model\n",
    "\n",
    "pretrained_generator = pipeline(  # create a generator with built in params\n",
    "    'text-generation', model=model, tokenizer='gpt2',\n",
    "    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "id": "18101335",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------\n",
      "This dataset shows the relationship between the number of years that a student has left the room.\n",
      "In this blog post, we will look at how to use this data to create user profiles for other departments\n",
      "based on their use of KPI.\n",
      "\n",
      "----------\n",
      "This dataset shows the relationship between\n",
      "weighting and each value for the model\n",
      "Let's look at some basic relationships to predict the probability of a certain type of event:\n",
      "data['id'] = z_test.mean()\n",
      "# predict a\n",
      "----------\n",
      "This dataset shows the relationship between gender (as shown by the bar chart) and each major quantitative measure of medical attention\n",
      "about the population:\n",
      "\n",
      "[ 463 ]\n",
      "\n",
      "\f",
      "Basic Statistics\n",
      "\n",
      "Chapter 18\n",
      "\n",
      "Now let's look at statistics\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "print('----------')\n",
    "for generated_sequence in pretrained_generator('This dataset shows the relationship', num_return_sequences=3):\n",
    "    print(generated_sequence['generated_text'])\n",
    "    print('----------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0f775e4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 470\n",
      "  Batch size = 32\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='30' max='15' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [15/15 47:47]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 4.5039801597595215,\n",
       " 'eval_runtime': 194.2009,\n",
       " 'eval_samples_per_second': 2.42,\n",
       " 'eval_steps_per_second': 0.077}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./gpt2_pds\", #The output directory\n",
    "    overwrite_output_dir=True, #overwrite the content of the output directory\n",
    "    num_train_epochs=3, # number of training epochs\n",
    "    per_device_train_batch_size=32, # batch size for training\n",
    "    per_device_eval_batch_size=32,  # batch size for evaluation\n",
    "    logging_steps=10,\n",
    "    load_best_model_at_end=True,\n",
    "    evaluation_strategy='epoch',\n",
    "    save_strategy='epoch'\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    data_collator=data_collator,\n",
    "    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],\n",
    "    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]\n",
    ")\n",
    "\n",
    "trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5a280a0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/sinanozdemir/opt/anaconda3/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "***** Running training *****\n",
      "  Num examples = 1878\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 32\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 32\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 177\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='177' max='177' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [177/177 2:12:56, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>3.316500</td>\n",
       "      <td>3.481012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3.056100</td>\n",
       "      <td>3.446980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2.896000</td>\n",
       "      <td>3.451081</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 470\n",
      "  Batch size = 32\n",
      "Saving model checkpoint to ./gpt2_pds/checkpoint-59\n",
      "Configuration saved in ./gpt2_pds/checkpoint-59/config.json\n",
      "Model weights saved in ./gpt2_pds/checkpoint-59/pytorch_model.bin\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 470\n",
      "  Batch size = 32\n",
      "Saving model checkpoint to ./gpt2_pds/checkpoint-118\n",
      "Configuration saved in ./gpt2_pds/checkpoint-118/config.json\n",
      "Model weights saved in ./gpt2_pds/checkpoint-118/pytorch_model.bin\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 470\n",
      "  Batch size = 32\n",
      "Saving model checkpoint to ./gpt2_pds/checkpoint-177\n",
      "Configuration saved in ./gpt2_pds/checkpoint-177/config.json\n",
      "Model weights saved in ./gpt2_pds/checkpoint-177/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "Loading best model from ./gpt2_pds/checkpoint-118 (score: 3.4469802379608154).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=177, training_loss=3.169361955028469, metrics={'train_runtime': 8020.7923, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.022, 'total_flos': 184014913536000.0, 'train_loss': 3.169361955028469, 'epoch': 3.0})"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f477e12c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 470\n",
      "  Batch size = 32\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='15' max='15' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [15/15 02:53]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 3.4469802379608154,\n",
       " 'eval_runtime': 186.6424,\n",
       " 'eval_samples_per_second': 2.518,\n",
       " 'eval_steps_per_second': 0.08,\n",
       " 'epoch': 3.0}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()  # loss decrease is slowing down so we are hitting our limit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b2f1abe6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to ./gpt2_pds\n",
      "Configuration saved in ./gpt2_pds/config.json\n",
      "Model weights saved in ./gpt2_pds/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "trainer.save_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9079151c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file ./gpt2_pds/config.json\n",
      "Model config GPT2Config {\n",
      "  \"_name_or_path\": \"gpt2\",\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"do_sample\": true,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"max_length\": 50,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.19.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading weights file ./gpt2_pds/pytorch_model.bin\n",
      "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
      "\n",
      "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./gpt2_pds.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')\n",
    "\n",
    "finetuned_generator = pipeline(\n",
    "    'text-generation', model=loaded_model, tokenizer=tokenizer,\n",
    "    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "id": "60aee57d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------\n",
      "This dataset shows the relationship between education and age.\n",
      "\n",
      "This data was obtained from\n",
      "pf2scs.kdf using the KdfTree method of classification, which is cross-validated with Python and the\n",
      "PdfFrame utility\n",
      "----------\n",
      "This dataset shows the relationship between COS and our sample distribution, we can easily see on the graph:\n",
      "This leads us to the next big thing: we can visualize variables as a single line graph. This graph has a cross-sectional length of\n",
      "----------\n",
      "This dataset shows the relationship between\n",
      "height and length of the data set:\n",
      "\n",
      "(b_height = height[1] == 0)\n",
      "So, that in our case is about 4200\n",
      "people in height, which means that\n",
      "I\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "# examples are now sustainably about data\n",
    "print('----------')\n",
    "for generated_sequence in finetuned_generator('This dataset shows the relationship', num_return_sequences=3):\n",
    "    print(generated_sequence['generated_text'])\n",
    "    print('----------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae0a3279",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "282bee9d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "e044664f",
   "metadata": {},
   "source": [
    "## GPT for code dictation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ebb28b49",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, \\\n",
    "                         GPT2LMHeadModel, pipeline\n",
    "from datasets import Dataset\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cc4a8564",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>English</th>\n",
       "      <th>LaTeX</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>integral from a to b of x squared</td>\n",
       "      <td>\\int_{a}^{b} x^2 \\,dx</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>integral from negative 1 to 1 of x squared</td>\n",
       "      <td>\\int_{-1}^{1} x^2 \\,dx</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      English                   LaTeX\n",
       "0           integral from a to b of x squared   \\int_{a}^{b} x^2 \\,dx\n",
       "1  integral from negative 1 to 1 of x squared  \\int_{-1}^{1} x^2 \\,dx"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv('../data/english_to_latex.csv')\n",
    "\n",
    "print(data.shape)\n",
    "\n",
    "data.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8a166ea6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>English</th>\n",
       "      <th>LaTeX</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>integral from a to b of x squared</td>\n",
       "      <td>\\int_{a}^{b} x^2 \\,dx</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>integral from negative 1 to 1 of x squared</td>\n",
       "      <td>\\int_{-1}^{1} x^2 \\,dx</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>integral from negative 1 to infinity of x cubed</td>\n",
       "      <td>\\int_{-1}^{\\inf} x^3 \\,dx</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>integral from 0 to infinity of x squared</td>\n",
       "      <td>\\int_{0}^{\\inf} x^2 \\,dx</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>integral from 0 to infinity of y squared</td>\n",
       "      <td>\\int_{0}^{\\inf} y^2 \\,dy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>integral from 1 to 2 of x over 2</td>\n",
       "      <td>\\int_{1}^{2} \\frac{x}{2} \\,dx</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>f of x equals x squared</td>\n",
       "      <td>f(x) = x^2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>h of x equals x squared</td>\n",
       "      <td>h(x) = x^2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>g of x equals x squared</td>\n",
       "      <td>g(x) = x^2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>g of x equals x to the eighth power</td>\n",
       "      <td>g(x) = x^8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           English  \\\n",
       "0                integral from a to b of x squared   \n",
       "1       integral from negative 1 to 1 of x squared   \n",
       "2  integral from negative 1 to infinity of x cubed   \n",
       "3         integral from 0 to infinity of x squared   \n",
       "4         integral from 0 to infinity of y squared   \n",
       "5                 integral from 1 to 2 of x over 2   \n",
       "6                          f of x equals x squared   \n",
       "7                          h of x equals x squared   \n",
       "8                          g of x equals x squared   \n",
       "9              g of x equals x to the eighth power   \n",
       "\n",
       "                           LaTeX  \n",
       "0          \\int_{a}^{b} x^2 \\,dx  \n",
       "1         \\int_{-1}^{1} x^2 \\,dx  \n",
       "2      \\int_{-1}^{\\inf} x^3 \\,dx  \n",
       "3       \\int_{0}^{\\inf} x^2 \\,dx  \n",
       "4       \\int_{0}^{\\inf} y^2 \\,dy  \n",
       "5  \\int_{1}^{2} \\frac{x}{2} \\,dx  \n",
       "6                     f(x) = x^2  \n",
       "7                     h(x) = x^2  \n",
       "8                     g(x) = x^2  \n",
       "9                     g(x) = x^8  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f747b7f0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c138752d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "bc988ea9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
    "\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "# Add our singular prompt\n",
    "CONVERSION_PROMPT = 'LCT\\n'  # LaTeX conversion task\n",
    "\n",
    "CONVERSION_TOKEN = 'LaTeX:'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e4aa061",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "db5ebf1e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: integral from a to b of x squared\n",
      "LaTeX: \\int_{a}^{b} x^2 \\,dx\n"
     ]
    }
   ],
   "source": [
    "# This is our \"training prompt\" that we want GPT2 to recognize and learn\n",
    "training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)\n",
    "\n",
    "print(training_examples[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "beb1b918",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LCT\\nEnglish: integral from a to b of x square...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LCT\\nEnglish: integral from negative 1 to 1 of...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text\n",
       "0  LCT\\nEnglish: integral from a to b of x square...\n",
       "1  LCT\\nEnglish: integral from negative 1 to 1 of..."
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "task_df = pd.DataFrame({'text': training_examples})\n",
    "\n",
    "task_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c98cd94a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b7617abd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# adding the EOS token at the end so the model knows when to stop predicting\n",
    "\n",
    "task_df['text'] = task_df['text'].map(lambda x: f'{x}{tokenizer.eos_token}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76552d13",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "074754cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "574e112410714ec1b21ae25afa1efca9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset\n",
    "\n",
    "def preprocess(examples):  \n",
    "    # tokenize our text but don't pad because our collator will pad for us dynamically\n",
    "    return tokenizer(examples['text'], truncation=True)\n",
    "\n",
    "latex_data = latex_data.map(preprocess, batched=True)\n",
    "\n",
    "latex_data = latex_data.train_test_split(train_size=.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "16bb726f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'LCT\\nEnglish: integral from a to b of x squared\\nLaTeX: \\\\int_{a}^{b} x^2 \\\\,dx<|endoftext|>',\n",
       " 'input_ids': [43,\n",
       "  4177,\n",
       "  198,\n",
       "  15823,\n",
       "  25,\n",
       "  19287,\n",
       "  422,\n",
       "  257,\n",
       "  284,\n",
       "  275,\n",
       "  286,\n",
       "  2124,\n",
       "  44345,\n",
       "  198,\n",
       "  14772,\n",
       "  49568,\n",
       "  25,\n",
       "  3467,\n",
       "  600,\n",
       "  23330,\n",
       "  64,\n",
       "  92,\n",
       "  36796,\n",
       "  65,\n",
       "  92,\n",
       "  2124,\n",
       "  61,\n",
       "  17,\n",
       "  3467,\n",
       "  11,\n",
       "  34350,\n",
       "  50256],\n",
       " 'attention_mask': [1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1,\n",
       "  1]}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latex_data['train'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "id": "4f1f30f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "id": "e44dd292",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /Users/sinanozdemir/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.19.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /Users/sinanozdemir/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
      "\n",
      "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0712e12d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 40\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 10\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latex_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "id": "7d7582bd",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PyTorch: setting up devices\n",
      "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='2' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1/1 00:34]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 4.891348361968994,\n",
       " 'eval_runtime': 2.9897,\n",
       " 'eval_samples_per_second': 3.345,\n",
       " 'eval_steps_per_second': 0.334}"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./english_to_latex\",\n",
    "    overwrite_output_dir=True, # overwrite the content of the output directory\n",
    "    num_train_epochs=5, # number of training epochs\n",
    "    per_device_train_batch_size=4, # batch size for training\n",
    "    per_device_eval_batch_size=20,  # batch size for evaluation\n",
    "    load_best_model_at_end=True,\n",
    "    logging_steps=5,\n",
    "    log_level='info',\n",
    "    evaluation_strategy='epoch',\n",
    "    save_strategy='epoch'\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=latex_gpt2,\n",
    "    args=training_args,\n",
    "    train_dataset=latex_data[\"train\"],\n",
    "    eval_dataset=latex_data[\"test\"],\n",
    "    data_collator=data_collator,\n",
    ")\n",
    "\n",
    "trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "id": "b78c75d0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "/Users/sinanozdemir/opt/anaconda3/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "***** Running training *****\n",
      "  Num examples = 40\n",
      "  Num Epochs = 5\n",
      "  Instantaneous batch size per device = 4\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 4\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 50\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='50' max='50' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [50/50 02:57, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2.518400</td>\n",
       "      <td>1.832018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1.231000</td>\n",
       "      <td>1.002905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.842700</td>\n",
       "      <td>0.916263</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.649000</td>\n",
       "      <td>0.909049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.704200</td>\n",
       "      <td>0.881874</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./english_to_latex/checkpoint-10\n",
      "Configuration saved in ./english_to_latex/checkpoint-10/config.json\n",
      "Model weights saved in ./english_to_latex/checkpoint-10/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./english_to_latex/checkpoint-20\n",
      "Configuration saved in ./english_to_latex/checkpoint-20/config.json\n",
      "Model weights saved in ./english_to_latex/checkpoint-20/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./english_to_latex/checkpoint-30\n",
      "Configuration saved in ./english_to_latex/checkpoint-30/config.json\n",
      "Model weights saved in ./english_to_latex/checkpoint-30/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./english_to_latex/checkpoint-40\n",
      "Configuration saved in ./english_to_latex/checkpoint-40/config.json\n",
      "Model weights saved in ./english_to_latex/checkpoint-40/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./english_to_latex/checkpoint-50\n",
      "Configuration saved in ./english_to_latex/checkpoint-50/config.json\n",
      "Model weights saved in ./english_to_latex/checkpoint-50/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "Loading best model from ./english_to_latex/checkpoint-50 (score: 0.8818739056587219).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=50, training_loss=1.459311113357544, metrics={'train_runtime': 180.7542, 'train_samples_per_second': 1.106, 'train_steps_per_second': 0.277, 'total_flos': 3529483776000.0, 'train_loss': 1.459311113357544, 'epoch': 5.0})"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "id": "d3c90137",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1/1 : < :]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 0.8818739056587219,\n",
       " 'eval_runtime': 2.9323,\n",
       " 'eval_samples_per_second': 3.41,\n",
       " 'eval_steps_per_second': 0.341,\n",
       " 'epoch': 5.0}"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()  # best loss of 0.8818739"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6010cf42",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "7e795393",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try fine-tuning it again but first let's have the model read a math book"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "id": "173901dd",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/sinanozdemir/opt/anaconda3/lib/python3.9/site-packages/transformers/data/datasets/language_modeling.py:54: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n",
      "  warnings.warn(\n",
      "Creating features from dataset file at ../data\n",
      "Saving features into cached file ../data/cached_lm_GPT2Tokenizer_128_latex_cheat_sheet.tex [took 0.000 s]\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /Users/sinanozdemir/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.19.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /Users/sinanozdemir/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
      "\n",
      "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n",
      "PyTorch: setting up devices\n",
      "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n"
     ]
    }
   ],
   "source": [
    "# Linear Algebra book by Jim Hefferon written in LaTeX for free - https://joshua.smcvt.edu/linearalgebra\n",
    "\n",
    "book_data = TextDataset(\n",
    "    tokenizer=tokenizer,\n",
    "    file_path='../data/latex_cheat_sheet.tex',  # train on a LaTeX cheat sheet they made\n",
    "    block_size=128\n",
    ")\n",
    "\n",
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling\n",
    ")\n",
    "\n",
    "latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./math_book\",\n",
    "    overwrite_output_dir=True, # overwrite the content of the output directory\n",
    "    num_train_epochs=2, # number of training epochs\n",
    "    per_device_train_batch_size=32, # batch size for training\n",
    "    per_device_eval_batch_size=32,  # batch size for evaluation\n",
    "    load_best_model_at_end=True,\n",
    "    logging_steps=1,\n",
    "    eval_steps=1,\n",
    "    evaluation_strategy='epoch',\n",
    "    save_strategy='epoch'\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=latex_gpt2,\n",
    "    args=training_args,\n",
    "    data_collator=data_collator,\n",
    "    train_dataset=book_data.examples[:int(len(book_data.examples)*.8)],\n",
    "    eval_dataset=book_data.examples[int(len(book_data.examples)*.8):]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "id": "e466aa20",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 21\n",
      "  Batch size = 32\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='2' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1/1 04:05]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 3.315159320831299,\n",
       " 'eval_runtime': 18.102,\n",
       " 'eval_samples_per_second': 1.16,\n",
       " 'eval_steps_per_second': 0.055}"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()  # initial loss for the math book"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "id": "388afa38",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/sinanozdemir/opt/anaconda3/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "***** Running training *****\n",
      "  Num examples = 80\n",
      "  Num Epochs = 2\n",
      "  Instantaneous batch size per device = 32\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 32\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 6\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='6' max='6' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [6/6 07:29, Epoch 2/2]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2.948300</td>\n",
       "      <td>2.800811</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2.525500</td>\n",
       "      <td>2.687646</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 21\n",
      "  Batch size = 32\n",
      "Saving model checkpoint to ./math_book/checkpoint-3\n",
      "Configuration saved in ./math_book/checkpoint-3/config.json\n",
      "Model weights saved in ./math_book/checkpoint-3/pytorch_model.bin\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 21\n",
      "  Batch size = 32\n",
      "Saving model checkpoint to ./math_book/checkpoint-6\n",
      "Configuration saved in ./math_book/checkpoint-6/config.json\n",
      "Model weights saved in ./math_book/checkpoint-6/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "Loading best model from ./math_book/checkpoint-6 (score: 2.6876461505889893).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=6, training_loss=2.669228434562683, metrics={'train_runtime': 537.9366, 'train_samples_per_second': 0.297, 'train_steps_per_second': 0.011, 'total_flos': 10451681280000.0, 'train_loss': 2.669228434562683, 'epoch': 2.0})"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "id": "f742df82",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to ./math_book\n",
      "Configuration saved in ./math_book/config.json\n",
      "Model weights saved in ./math_book/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "trainer.save_model()  # 2 epochs led to a pretty good drop in loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36706f17",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "id": "0c820b1b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file ./math_book/config.json\n",
      "Model config GPT2Config {\n",
      "  \"_name_or_path\": \"gpt2\",\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.19.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading weights file ./math_book/pytorch_model.bin\n",
      "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
      "\n",
      "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./math_book.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n",
      "PyTorch: setting up devices\n",
      "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='2' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1/1 00:34]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 4.376880168914795,\n",
       " 'eval_runtime': 3.0114,\n",
       " 'eval_samples_per_second': 3.321,\n",
       " 'eval_steps_per_second': 0.332}"
      ]
     },
     "execution_count": 226,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "math_latex_gpt2 = GPT2LMHeadModel.from_pretrained('./math_book')  # load up our gpt pre-trained\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./math_english_to_latex\",\n",
    "    overwrite_output_dir=True, #overwrite the content of the output directory\n",
    "    num_train_epochs=5, # number of training epochs\n",
    "    per_device_train_batch_size=4, # batch size for training\n",
    "    per_device_eval_batch_size=20,  # batch size for evaluation\n",
    "    load_best_model_at_end=True,\n",
    "    logging_steps=5,\n",
    "    log_level='info',\n",
    "    evaluation_strategy='epoch',\n",
    "    save_strategy='epoch'\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=math_latex_gpt2,\n",
    "    args=training_args,\n",
    "    train_dataset=latex_data[\"train\"],\n",
    "    eval_dataset=latex_data[\"test\"],\n",
    "    data_collator=data_collator,\n",
    ")\n",
    "\n",
    "trainer.evaluate()  # loss is starting slightly lower than before"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "id": "67a53cd6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "/Users/sinanozdemir/opt/anaconda3/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "***** Running training *****\n",
      "  Num examples = 40\n",
      "  Num Epochs = 5\n",
      "  Instantaneous batch size per device = 4\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 4\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 50\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='50' max='50' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [50/50 02:54, Epoch 5/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2.311200</td>\n",
       "      <td>1.584350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1.156600</td>\n",
       "      <td>0.947031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.774600</td>\n",
       "      <td>0.905974</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.615500</td>\n",
       "      <td>0.887093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.669800</td>\n",
       "      <td>0.867295</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./math_english_to_latex/checkpoint-10\n",
      "Configuration saved in ./math_english_to_latex/checkpoint-10/config.json\n",
      "Model weights saved in ./math_english_to_latex/checkpoint-10/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./math_english_to_latex/checkpoint-20\n",
      "Configuration saved in ./math_english_to_latex/checkpoint-20/config.json\n",
      "Model weights saved in ./math_english_to_latex/checkpoint-20/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./math_english_to_latex/checkpoint-30\n",
      "Configuration saved in ./math_english_to_latex/checkpoint-30/config.json\n",
      "Model weights saved in ./math_english_to_latex/checkpoint-30/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./math_english_to_latex/checkpoint-40\n",
      "Configuration saved in ./math_english_to_latex/checkpoint-40/config.json\n",
      "Model weights saved in ./math_english_to_latex/checkpoint-40/pytorch_model.bin\n",
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n",
      "Saving model checkpoint to ./math_english_to_latex/checkpoint-50\n",
      "Configuration saved in ./math_english_to_latex/checkpoint-50/config.json\n",
      "Model weights saved in ./math_english_to_latex/checkpoint-50/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "Loading best model from ./math_english_to_latex/checkpoint-50 (score: 0.8672950863838196).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=50, training_loss=1.3503571081161498, metrics={'train_runtime': 178.071, 'train_samples_per_second': 1.123, 'train_steps_per_second': 0.281, 'total_flos': 3529483776000.0, 'train_loss': 1.3503571081161498, 'epoch': 5.0})"
      ]
     },
     "execution_count": 227,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "id": "a264d92c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 10\n",
      "  Batch size = 20\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1/1 : < :]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 0.8672950863838196,\n",
       " 'eval_runtime': 2.9253,\n",
       " 'eval_samples_per_second': 3.418,\n",
       " 'eval_steps_per_second': 0.342,\n",
       " 'epoch': 5.0}"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()  # pre-training on the book for one epoch led to a minor drop in loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "id": "530c78c5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to ./math_english_to_latex\n",
      "Configuration saved in ./math_english_to_latex/config.json\n",
      "Model weights saved in ./math_english_to_latex/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "trainer.save_model()  # save this model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13153c32",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "id": "c34b7af3",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file ./math_english_to_latex/config.json\n",
      "Model config GPT2Config {\n",
      "  \"_name_or_path\": \"./math_book\",\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.19.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading weights file ./math_english_to_latex/pytorch_model.bin\n",
      "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
      "\n",
      "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./math_english_to_latex.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "loaded_model = GPT2LMHeadModel.from_pretrained('./math_english_to_latex')\n",
    "latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "id": "759e9b70",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: g of x equals integral from 0 to 1 of x squared\n",
      "LaTeX:\n"
     ]
    }
   ],
   "source": [
    "text_sample = 'g of x equals integral from 0 to 1 of x squared'\n",
    "conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\\n{CONVERSION_TOKEN}'\n",
    "\n",
    "print(conversion_text_sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "id": "2b2cb777",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: g of x equals integral from 0 to 1 of x squared\n",
      "LaTeX: g(x) = \\int_{0}^{1} x^2 \\,dx^\n"
     ]
    }
   ],
   "source": [
    "print(latex_generator(\n",
    "    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,\n",
    "    max_length=len(tokenizer.encode(conversion_text_sample)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfd233ed",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "id": "46d13e88",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: r of x is sum from 0 to x of x squared\n",
      "LaTeX: r(x) = \\sum_{0}^{x} x^2 \\,dx^\n"
     ]
    }
   ],
   "source": [
    "# Another example\n",
    "text_sample = 'r of x is sum from 0 to x of x squared'\n",
    "conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\\n{CONVERSION_TOKEN}'\n",
    "\n",
    "print(latex_generator(\n",
    "    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,\n",
    "    max_length=len(tokenizer.encode(conversion_text_sample)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af4fbaf3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "id": "9ac6c305",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "r of x is sum from 0 to x of x squared\n",
      "LaTeX: \\int_{0}^{x}^2 \\,dx^2 \\,dx^3 \\,dx^4\n"
     ]
    }
   ],
   "source": [
    "print(latex_generator(\n",
    "    text_sample, num_beams=5, early_stopping=True, temperature=0.7,\n",
    "    max_length=len(tokenizer.encode(conversion_text_sample)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "702462a6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5465370",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "id": "21567e80",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: pi to the 8th power\n",
      "LaTeX: \\pi^8 \\,dx^2 \\,dx^3 \\,dx^4 \\\n"
     ]
    }
   ],
   "source": [
    "# Another example\n",
    "text_sample = 'pi to the 8th power'\n",
    "conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\\n{CONVERSION_TOKEN}'\n",
    "\n",
    "print(latex_generator(\n",
    "    conversion_text_sample, num_beams=5, early_stopping=True, temperature=.7,\n",
    "    max_length=len(tokenizer.encode(conversion_text_sample)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cee3fabe",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "id": "33e121e6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /Users/sinanozdemir/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.19.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /Users/sinanozdemir/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
      "\n",
      "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "# Sanity check that a non-finetuned model could not have done this\n",
    "non_finetuned_latex_generator = pipeline(\n",
    "    'text-generation', \n",
    "    model=GPT2LMHeadModel.from_pretrained('gpt2'),  # not fine-tuned!\n",
    "    tokenizer=tokenizer\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0185f085",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "id": "ab316df9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: f of x is sum from 0 to x of x squared\n",
      "LaTeX: f(x) = \\sum_{0}^{x} x^2 \\,dx ###\n",
      "LCT\n",
      "English: f of x equals integral from 0 to pi of x to the fourth power\n",
      "LaTeX: f(x) = \\int_{0}^{\\pi} x^4 \\,dx ###\n",
      "LCT\n",
      "English: pi to the 8th power\n",
      "LaTeX: pi to the 8th power\n",
      "LaTeX: f(x) = \\int_{0}\n"
     ]
    }
   ],
   "source": [
    "few_shot_prompt = \"\"\"LCT\n",
    "English: f of x is sum from 0 to x of x squared\n",
    "LaTeX: f(x) = \\sum_{0}^{x} x^2 \\,dx \\\n",
    "###\n",
    "LCT\n",
    "English: f of x equals integral from 0 to pi of x to the fourth power\n",
    "LaTeX: f(x) = \\int_{0}^{\\pi} x^4 \\,dx \\\n",
    "###\n",
    "LCT\n",
    "English: pi to the 8th power\n",
    "LaTeX:\"\"\"\n",
    "\n",
    "print(non_finetuned_latex_generator(\n",
    "    few_shot_prompt, num_beams=5, early_stopping=True, temperature=0.7,\n",
    "    max_length=len(tokenizer.encode(few_shot_prompt)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9243a0d9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "id": "d1f0418c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "English to LaTeX\n",
      "English: f of x is sum from 0 to x of x squared\n",
      "LaTeX: f(x) = \\sum_{0}^{x} x^2 \\,dx ###\n",
      "LCT\n",
      "English: f of x equals integral from 0 to pi of x to the fourth power\n",
      "LaTeX: f(x) = \\int_{0}^{\\pi} x^4 \\,dx ###\n",
      "LCT\n",
      "English: x to the eighth power\n",
      "LaTeX: f(x) = \\int_{0}^{\\pi} x^4 \\,dx\n"
     ]
    }
   ],
   "source": [
    "# try another prompt\n",
    "few_shot_prompt = \"\"\"English to LaTeX\n",
    "English: f of x is sum from 0 to x of x squared\n",
    "LaTeX: f(x) = \\sum_{0}^{x} x^2 \\,dx \\\n",
    "###\n",
    "LCT\n",
    "English: f of x equals integral from 0 to pi of x to the fourth power\n",
    "LaTeX: f(x) = \\int_{0}^{\\pi} x^4 \\,dx \\\n",
    "###\n",
    "LCT\n",
    "English: x to the eighth power\n",
    "LaTeX:\"\"\"\n",
    "\n",
    "print(non_finetuned_latex_generator(\n",
    "    few_shot_prompt, num_beams=5, early_stopping=True, temperature=0.7,\n",
    "    max_length=len(tokenizer.encode(few_shot_prompt)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29c969b2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "id": "c87ac747",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LCT\n",
      "English: pi to the 8th power\n",
      "LaTeX: pi to the 8th power\n",
      "\n",
      "LaTeX: pi to the 8th power\n",
      "\n",
      "La\n"
     ]
    }
   ],
   "source": [
    "print(non_finetuned_latex_generator(\n",
    "    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,\n",
    "    max_length=len(tokenizer.encode(conversion_text_sample)) + 20\n",
    ")[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87615fb5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f280e57b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
